RNA-seq analysis
# Load libraries and custom functions -----------------------------------------
library(easypackages)
libraries("here","ggplot2","WGCNA","limma","statmod","edgeR","biomaRt",
"sva","patchwork","Biobase","reshape2","genefilter","variancePartition",
"BiocParallel","tidyverse","gplots","enrichR","ggrepel","glue","ggtext")
source(here("code","utils.R"))
# Various parameters to set ---------------------------------------------------
# set FDR thresh
fdr_qThresh = 0.05
minReads = 100
varianceFilterCutoff = 0.15
nsv = 2
# Paths -----------------------------------------------------------------------
# paths
root_path = here()
data_path = here("data")
code_path = here("code")
plot_path = here("plots")
result_path = here("results")
res_path = file.path(result_path,"filt","sva")
# load data -------------------------------------------------------------------
# read count data
reads_file = file.path(data_path,"tidy_reads.csv")
reads_data = read.csv(reads_file)
# sample meta data
meta_file = file.path(data_path,"tidy_metadata.csv")
meta_data = read.csv(meta_file)
colnames(reads_data)[4:ncol(reads_data)] = meta_data$sample_id
# mouse gene meta data - from utils::getGeneMetaData
gene_metadata_file = file.path(data_path,"tidy_gene_meta_data.csv")
gene_meta_data = read.csv(gene_metadata_file,row.names = 1)
# mouse to human homolog gene meta data - from utils::getHumanHomolog
hsap_homolog_metadata_file = file.path(data_path,"tidy_human_homolog_gene_meta_data.csv")
hsap_homolog_gene_meta_data = read.csv(hsap_homolog_metadata_file)
# set up expression and meta data matrices ------------------------------------
# PFC data
pfc_names = meta_data$sample_id[meta_data$brain_region=="PFC"]
exprs_pfc = reads_data[,pfc_names]
rownames(exprs_pfc) = reads_data$ensembl_id
meta_data_pfc = subset(meta_data, meta_data$brain_region=="PFC")
# HYT
hyt_names = meta_data$sample_id[meta_data$brain_region=="HYT"]
exprs_hyt = reads_data[,hyt_names]
rownames(exprs_hyt) = reads_data$ensembl_id
meta_data_hyt = subset(meta_data, meta_data$brain_region=="HYT")
Preprocessing
### PCA to summarize the Picard sequencing variables --------------------------
npcs2use = 10
seq_vars2use = c("PCT_CODING_BASES","PCT_UTR_BASES","PCT_INTRONIC_BASES",
"PCT_INTERGENIC_BASES","MEDIAN_CV_COVERAGE","MEDIAN_5PRIME_BIAS",
"ALIGNED_READS","AT_DROPOUT")
# PFC data
pca_res_seq_pfc = prcomp(meta_data_pfc[,seq_vars2use], center = TRUE, scale = TRUE)
meta_data_pfc = cbind(meta_data_pfc, pca_res_seq_pfc$x)
summary(pca_res_seq_pfc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8701 1.2636 1.2411 0.89062 0.71602 0.1980 0.13927
## Proportion of Variance 0.4372 0.1996 0.1925 0.09915 0.06409 0.0049 0.00242
## Cumulative Proportion 0.4372 0.6368 0.8293 0.92846 0.99254 0.9974 0.99987
## PC8
## Standard deviation 0.03253
## Proportion of Variance 0.00013
## Cumulative Proportion 1.00000
# HYT data
pca_res_seq_hyt = prcomp(meta_data_hyt[,seq_vars2use], center = TRUE, scale = TRUE)
meta_data_hyt = cbind(meta_data_hyt, pca_res_seq_hyt$x)
summary(pca_res_seq_hyt)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.778 1.4909 1.0176 0.8314 0.80445 0.46944 0.1522
## Proportion of Variance 0.395 0.2778 0.1294 0.0864 0.08089 0.02755 0.0029
## Cumulative Proportion 0.395 0.6728 0.8023 0.8887 0.96955 0.99710 1.0000
## PC8
## Standard deviation 0.005957
## Proportion of Variance 0.000000
## Cumulative Proportion 1.000000
### Filtering ------------------------------------------------------------------
# PFC only
genes2keep = rowSums(exprs_pfc>minReads)>=2
# genes to throw out
tossedOutData = exprs_pfc[!genes2keep,]
# grab only genes to keep
exprs_filt_pfc = exprs_pfc[genes2keep,]
sprintf("Genes before filtering out low reads = %d",dim(exprs_pfc)[1])
## [1] "Genes before filtering out low reads = 54165"
sprintf("Genes after filtering out low reads = %d",dim(exprs_filt_pfc)[1])
## [1] "Genes after filtering out low reads = 14506"
sprintf("Max reads for any gene tossed out = %d",max(as.matrix(tossedOutData)))
## [1] "Max reads for any gene tossed out = 212"
sprintf("Mean reads for any gene tossed out = %f",mean(as.matrix(tossedOutData)))
## [1] "Mean reads for any gene tossed out = 4.012234"
sprintf("Median reads for any gene tossed out = %f",median(as.matrix(tossedOutData)))
## [1] "Median reads for any gene tossed out = 0.000000"
genes_before = dim(exprs_filt_pfc)[1]
exprs_filt_pfc = varFilter(as.matrix(exprs_filt_pfc), var.cutoff = varianceFilterCutoff)
genes_after = dim(exprs_filt_pfc)[1]
print(sprintf("Genes before variance filtering at %.02f = %d",varianceFilterCutoff,genes_before))
## [1] "Genes before variance filtering at 0.15 = 14506"
print(sprintf("Genes after variance filtering at %.02f = %d",varianceFilterCutoff,genes_after))
## [1] "Genes after variance filtering at 0.15 = 12294"
# HYT only
genes2keep = rowSums(exprs_hyt>minReads)>=2
# genes to throw out
tossedOutData = exprs_hyt[!genes2keep,]
# grab only genes to keep
exprs_filt_hyt = exprs_hyt[genes2keep,]
sprintf("Genes before filtering out low reads = %d",dim(exprs_hyt)[1])
## [1] "Genes before filtering out low reads = 54165"
sprintf("Genes after filtering out low reads = %d",dim(exprs_filt_hyt)[1])
## [1] "Genes after filtering out low reads = 16155"
sprintf("Max reads for any gene tossed out = %d",max(as.matrix(tossedOutData)))
## [1] "Max reads for any gene tossed out = 643"
sprintf("Mean reads for any gene tossed out = %f",mean(as.matrix(tossedOutData)))
## [1] "Mean reads for any gene tossed out = 4.337044"
sprintf("Median reads for any gene tossed out = %f",median(as.matrix(tossedOutData)))
## [1] "Median reads for any gene tossed out = 0.000000"
genes_before = dim(exprs_filt_hyt)[1]
exprs_filt_hyt = varFilter(as.matrix(exprs_filt_hyt), var.cutoff = varianceFilterCutoff)
genes_after = dim(exprs_filt_hyt)[1]
print(sprintf("Genes before variance filtering at %.02f = %d",varianceFilterCutoff,genes_before))
## [1] "Genes before variance filtering at 0.15 = 16155"
print(sprintf("Genes after variance filtering at %.02f = %d",varianceFilterCutoff,genes_after))
## [1] "Genes after variance filtering at 0.15 = 13727"
### Scale data by library size ------------------------------------------------
# turn into DGEList object
dge_pfc = DGEList(counts=exprs_filt_pfc)
dge_hyt = DGEList(counts=exprs_filt_hyt)
# Calculate normalization factors to scale the raw library sizes.
dgeN_pfc = calcNormFactors(dge_pfc, method="TMM")
dgeN_hyt = calcNormFactors(dge_hyt, method="TMM")
### Run voom to get log-CPM and precision weights for DE model ----------------
# PFC only --------------------------------------------------------------------
# make design matrix for modeling with voom
form2use = ~ group*sex + RIN
design_pfc = model.matrix(form2use, data = meta_data_pfc)
# use voom to calculate precision weights plus output count data as logCPM
v_pfc = voom(dgeN_pfc, design_pfc, plot=TRUE)

voomed_edata_pfc = v_pfc$E
voomed_precision_weights_pfc = v_pfc$weights
# HYT only --------------------------------------------------------------------
# make design matrix for modeling with voom
form2use = ~ group*sex + RIN
design_hyt = model.matrix(form2use, data = meta_data_hyt)
# use voom to calculate precision weights plus output count data as logCPM
v_hyt = voom(dgeN_hyt, design_hyt, plot=TRUE)

voomed_edata_hyt = v_hyt$E
voomed_precision_weights_hyt = v_hyt$weights
### Plot final preprocessed data ----------------------------------------------
# PFC plot data
x = melt(as.matrix(v_pfc$E))
colnames(x) = c('gene_name', 'sample_id', 'value')
ggplot(x, aes(x=value, color=sample_id)) + geom_density()

# HYT plot data
x = melt(as.matrix(v_hyt$E))
colnames(x) = c('gene_name', 'sample_id', 'value')
ggplot(x, aes(x=value, color=sample_id)) + geom_density()

### PCA on preprocessed expression data ---------------------------------------
# PFC -------------------------------------------------------------------------
pca_expr_pfc = prcomp(t(v_pfc$E), center = TRUE, scale = TRUE)
summary(pca_expr_pfc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 67.3607 45.0851 35.1122 29.41617 24.55718 23.05712
## Proportion of Variance 0.3691 0.1653 0.1003 0.07038 0.04905 0.04324
## Cumulative Proportion 0.3691 0.5344 0.6347 0.70508 0.75414 0.79738
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 19.89234 18.68826 18.33707 17.32292 13.7168 13.0248
## Proportion of Variance 0.03219 0.02841 0.02735 0.02441 0.0153 0.0138
## Cumulative Proportion 0.82957 0.85798 0.88533 0.90973 0.9250 0.9388
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 12.25960 11.50353 10.54770 10.03052 9.93079 9.23555
## Proportion of Variance 0.01223 0.01076 0.00905 0.00818 0.00802 0.00694
## Cumulative Proportion 0.95106 0.96183 0.97088 0.97906 0.98708 0.99402
## PC19 PC20
## Standard deviation 8.57407 1.974e-13
## Proportion of Variance 0.00598 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
# HYT -------------------------------------------------------------------------
pca_expr_hyt = prcomp(t(v_hyt$E), center = TRUE, scale = TRUE)
summary(pca_expr_hyt)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 82.2350 43.7813 36.12868 28.92985 23.75485 22.4755
## Proportion of Variance 0.4926 0.1396 0.09509 0.06097 0.04111 0.0368
## Cumulative Proportion 0.4926 0.6323 0.72737 0.78834 0.82945 0.8662
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 17.94329 15.82675 14.89912 13.87575 12.7791 12.18479
## Proportion of Variance 0.02345 0.01825 0.01617 0.01403 0.0119 0.01082
## Cumulative Proportion 0.88971 0.90795 0.92413 0.93815 0.9500 0.96086
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 11.11135 10.9901 10.28565 9.97009 9.36854 2.175e-13
## Proportion of Variance 0.00899 0.0088 0.00771 0.00724 0.00639 0.000e+00
## Cumulative Proportion 0.96986 0.9787 0.98636 0.99361 1.00000 1.000e+00
### Surrogate Variable Analysis (SVA) -----------------------------------------
# basic model before SVA
svaform2use = ~ group*sex + RIN
# PFC
sva_res_pfc = runSVA(expr_data = v_pfc$E,
meta_data = meta_data_pfc,
form2use = svaform2use,
nsv = nsv,
svestMethod = "leek")
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
# new meta_data data frame with the SVs
meta_data_pfc = sva_res_pfc$meta_data
# new model to use later in DE model, which incorporates SVs
form4model_pfc = sva_res_pfc$form2use
# HYT
sva_res_hyt = runSVA(expr_data = v_hyt$E,
meta_data = meta_data_hyt,
form2use = svaform2use,
nsv = nsv,
svestMethod = "leek")
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
# new meta_data data frame with the SVs
meta_data_hyt = sva_res_hyt$meta_data
# new model to use later in DE model, which incorporates SVs
form4model_hyt = sva_res_hyt$form2use
### Plot heatmaps -------------------------------------------------------------
# PFC
# plot correlations of PCA of expression data against PCA of sequencing-related variables
r_mat = cor(pca_expr_pfc$x, pca_res_seq_pfc$x)
heatmap.2(r_mat,
col = blueWhiteRed(50),
Rowv = TRUE,
trace = "none",
cellnote= round(as.matrix(r_mat),digits = 2),
notecol = "black",
notecex=0.9,
density.info = "none",
key.xlab ="Correlation (r)")

print(r_mat)
## PC1 PC2 PC3 PC4 PC5
## PC1 -0.932530344 0.206630234 0.016102017 -0.11210853 0.07612601
## PC2 -0.197570167 -0.394090715 -0.096729275 -0.52226418 0.06251997
## PC3 -0.108667842 -0.728316401 0.410460491 0.03780872 -0.18791135
## PC4 -0.031634980 -0.278681139 -0.837069125 -0.06793425 -0.17243203
## PC5 0.083252803 0.019612289 0.196228354 -0.35416420 -0.51334731
## PC6 -0.012317975 0.239402929 -0.060505908 -0.07705545 -0.60396481
## PC7 0.156028198 0.260859904 0.078149293 -0.57357412 0.07723189
## PC8 -0.013290741 0.066576191 -0.046141700 -0.09895332 -0.16874454
## PC9 -0.130492844 -0.006894644 0.148871305 0.17870935 0.02934789
## PC10 0.007680893 -0.081190818 0.093785108 -0.14244978 -0.02599418
## PC11 0.017728460 -0.070512653 -0.102768376 0.06906079 0.10878710
## PC12 -0.144109103 0.108047117 -0.003641237 0.28851615 -0.37338938
## PC13 -0.009526943 -0.145772386 0.004459942 0.02580745 -0.12655331
## PC14 0.030428558 -0.071917351 -0.041471137 0.05691040 0.12089484
## PC15 0.024580982 0.104299973 0.104495285 -0.09236672 -0.03431177
## PC16 -0.068858030 -0.029158501 -0.012454110 0.26206314 -0.22579187
## PC17 0.045150045 0.005761638 -0.009688141 -0.09688062 0.14573053
## PC18 0.013872361 0.024641643 0.095583356 0.00615551 -0.02527730
## PC19 -0.013308579 -0.040414263 -0.085098025 0.08775646 -0.07110561
## PC20 -0.575070009 0.440271494 0.222988608 0.21071857 -0.10951222
## PC6 PC7 PC8
## PC1 0.091982868 -0.089207144 0.188140339
## PC2 -0.111683955 0.394129879 -0.518584019
## PC3 0.171057341 -0.400181852 0.099736953
## PC4 -0.153960197 -0.217705885 0.214794728
## PC5 -0.398157620 0.006739672 0.213603847
## PC6 0.359580139 -0.034522400 -0.055037109
## PC7 0.165682132 -0.253410471 -0.152794869
## PC8 0.105776700 -0.110561434 -0.007013627
## PC9 -0.578568989 -0.017525376 -0.093255895
## PC10 0.166320380 0.091444607 0.157405954
## PC11 0.147382328 -0.180079910 -0.123893927
## PC12 -0.134577806 -0.052025825 -0.444696626
## PC13 0.201340415 0.572037855 0.241181685
## PC14 0.330525732 -0.044068921 -0.164972351
## PC15 -0.155537558 -0.145752001 -0.015556238
## PC16 0.138191885 0.083032915 -0.306982124
## PC17 -0.052988134 -0.271617818 0.073204129
## PC18 -0.009118173 0.262035016 0.359289870
## PC19 0.037848771 0.089071026 0.081534010
## PC20 0.090215664 -0.244510172 0.126172966
# plot correlations of PCA of expression data against SVs from SVA
r_mat = cor(pca_expr_pfc$x, sva_res_pfc$meta_data[,sva_res_pfc$sv_names])
heatmap.2(r_mat,
col = blueWhiteRed(50),
Rowv = TRUE,
trace = "none",
cellnote= round(as.matrix(r_mat),digits = 2),
notecol = "black",
notecex=0.9,
density.info = "none",
key.xlab ="Correlation (r)")

print(r_mat)
## sv001 sv002
## PC1 -0.965997716 5.726108e-02
## PC2 -0.233003558 -5.155497e-01
## PC3 -0.076031693 7.803486e-01
## PC4 -0.037159016 -1.420795e-01
## PC5 0.049928542 -2.523472e-01
## PC6 0.010249353 -6.459147e-02
## PC7 -0.019557375 7.776587e-02
## PC8 -0.013515837 -1.358480e-01
## PC9 -0.011993882 -7.442180e-03
## PC10 -0.025400560 -6.370417e-02
## PC11 0.015998992 -2.426434e-02
## PC12 -0.027195275 -1.424549e-02
## PC13 -0.003551633 4.739788e-02
## PC14 -0.004703281 -2.630655e-06
## PC15 -0.008238460 -1.547988e-02
## PC16 -0.013855760 1.160439e-03
## PC17 -0.010070489 -4.062048e-02
## PC18 0.006465747 1.153551e-02
## PC19 0.003265470 1.642619e-02
## PC20 -0.550062806 2.239853e-01
# plot correlations of PCA of sequencing-related variabales against SVs from SVA
r_mat = cor(pca_res_seq_pfc$x, sva_res_pfc$meta_data[,sva_res_pfc$sv_names])
heatmap.2(r_mat,
col = blueWhiteRed(50),
Rowv = TRUE,
trace = "none",
cellnote= round(as.matrix(r_mat),digits = 2),
notecol = "black",
notecex=0.9,
density.info = "none",
key.xlab ="Correlation (r)")

print(r_mat)
## sv001 sv002
## PC1 0.963259621 -0.03881844
## PC2 -0.046169244 -0.32668724
## PC3 0.009368594 0.45090031
## PC4 0.216524137 0.37554106
## PC5 -0.084828926 0.03848017
## PC6 -0.084553430 0.30079839
## PC7 0.038657053 -0.44940866
## PC8 -0.049907205 0.27799831
# HYT
# plot correlations of PCA of expression data against PCA of sequencing-related variables
r_mat = cor(pca_expr_hyt$x, pca_res_seq_hyt$x)
heatmap.2(r_mat,
col = blueWhiteRed(50),
Rowv = TRUE,
trace = "none",
cellnote= round(as.matrix(r_mat),digits = 2),
notecol = "black",
notecex=0.9,
density.info = "none",
key.xlab ="Correlation (r)")

print(r_mat)
## PC1 PC2 PC3 PC4 PC5
## PC1 0.784704716 0.413463608 0.223287556 0.099196015 0.29934726
## PC2 0.521416651 -0.379305442 0.006808786 -0.500834841 -0.29793439
## PC3 0.165532526 -0.626684706 0.009154236 0.575401424 0.32243093
## PC4 0.068444213 -0.234702393 -0.243232631 -0.091239041 0.39549956
## PC5 0.225067608 -0.165458095 -0.412063055 0.164686318 -0.54422429
## PC6 -0.115176406 -0.288284416 0.284418614 -0.536105506 0.27964917
## PC7 0.032131498 -0.263848227 0.143968595 0.012377105 -0.12907961
## PC8 0.002385271 -0.073576046 -0.192517518 0.011236326 0.22034722
## PC9 -0.070660139 -0.078741957 0.578039001 0.215834425 -0.20350873
## PC10 0.069050904 0.121554726 -0.128354513 0.122868979 -0.08295568
## PC11 0.020229528 -0.133775436 0.040870258 -0.001287308 0.02573071
## PC12 0.033164722 -0.087562715 0.118598102 0.052409139 -0.13849559
## PC13 0.003660998 -0.000490014 -0.116909781 -0.034211956 -0.01844659
## PC14 0.023103740 -0.064013518 0.433233300 0.111890023 -0.21751614
## PC15 0.007158223 0.025768246 -0.038898264 -0.062434930 0.04593845
## PC16 -0.057584623 0.022550220 0.107528880 0.022698637 0.03584778
## PC17 0.011574642 0.005766781 -0.039951902 -0.050697752 0.04956847
## PC18 -0.777728317 -0.501256827 -0.216249777 -0.047290663 -0.09914285
## PC6 PC7 PC8
## PC1 -0.23493764 0.023157747 -0.02530176
## PC2 0.40268124 -0.061494252 0.18819895
## PC3 0.08414573 -0.205528609 -0.07214646
## PC4 0.24583874 0.440454521 -0.18830190
## PC5 -0.43437525 0.269980416 -0.24689257
## PC6 -0.39922334 -0.041317333 -0.27179056
## PC7 -0.42086345 -0.431752375 0.20022350
## PC8 -0.21553014 0.227359183 0.44582422
## PC9 0.07191374 0.385723230 0.16124718
## PC10 0.23975048 -0.314765384 -0.23666417
## PC11 -0.22711545 0.220207537 0.04242660
## PC12 -0.01107585 0.219718033 0.05527709
## PC13 -0.11134794 -0.124293977 -0.10536961
## PC14 0.11366350 0.058440097 -0.39925336
## PC15 -0.01081979 -0.127849959 -0.43023658
## PC16 -0.02510358 0.002605484 -0.08202267
## PC17 -0.05543935 0.258645421 -0.31685980
## PC18 0.10912374 -0.099229923 -0.02913442
# plot correlations of PCA of expression data against SVs from SVA
r_mat = cor(pca_expr_hyt$x, sva_res_hyt$meta_data[,sva_res_hyt$sv_names])
heatmap.2(r_mat,
col = blueWhiteRed(50),
Rowv = TRUE,
trace = "none",
cellnote= round(as.matrix(r_mat),digits = 2),
notecol = "black",
notecex=0.9,
density.info = "none",
key.xlab ="Correlation (r)")

print(r_mat)
## sv001 sv002
## PC1 -0.997150824 0.030535137
## PC2 0.001812948 0.903455208
## PC3 0.056456984 0.355569358
## PC4 0.021728877 0.115039195
## PC5 0.008066996 -0.097539838
## PC6 -0.040977115 -0.170289840
## PC7 -0.003390796 -0.038900968
## PC8 0.002521880 0.029303384
## PC9 0.005768762 -0.037221874
## PC10 -0.009130765 -0.005338418
## PC11 -0.003805584 -0.009525330
## PC12 -0.007659704 -0.013844508
## PC13 0.005537246 -0.011185879
## PC14 -0.002576028 -0.012085309
## PC15 -0.004246093 0.014031111
## PC16 0.004282247 -0.011103142
## PC17 0.001374117 0.000620804
## PC18 0.945246875 -0.045107331
# plot correlations of PCA of sequencing-related variabales against SVs from SVA
r_mat = cor(pca_res_seq_hyt$x, sva_res_hyt$meta_data[,sva_res_hyt$sv_names])
heatmap.2(r_mat,
col = blueWhiteRed(50),
Rowv = TRUE,
trace = "none",
cellnote= round(as.matrix(r_mat),digits = 2),
notecol = "black",
notecex=0.9,
density.info = "none",
key.xlab ="Correlation (r)")

print(r_mat)
## sv001 sv002
## PC1 -0.76592814 0.56028644
## PC2 -0.44249344 -0.50088918
## PC3 -0.24110988 -0.05979398
## PC4 -0.04649388 -0.19180828
## PC5 -0.28606224 -0.07014234
## PC6 0.25695695 0.53356455
## PC7 -0.01667900 -0.09189486
## PC8 0.03098848 0.19235500
PFC DE modeling
de_res_pfc = runDEmodel(expr_data = v_pfc, meta_data = meta_data_pfc,
form2use = form4model_pfc,
gene_meta_data = gene_meta_data)
# save tables
brain_region = "pfc"
fstem = sprintf("_filt%0.2f",varianceFilterCutoff)
table_names2use = c("Fstat","interaction","group","sex")
report2screen = sprintf("%s DE genes\n",brain_region)
for (itable in 1:length(table_names2use)){
fname2save = file.path(res_path,
brain_region,
sprintf("DE.%s_table%s.csv",
table_names2use[itable],
fstem))
tab2use = de_res_pfc[[sprintf("%s_table",table_names2use[itable])]]
tab2use = tab2use[order(tab2use$adj.P.Val),]
tab2use$DE = "NO"
tab2use$DE[tab2use$adj.P.Val<=fdr_qThresh] = "YES"
write.csv(tab2use, file = fname2save)
if (itable>1){
report2screen = c(report2screen,
sprintf("%s DE genes is %d\n",
table_names2use[itable],
sum(tab2use$adj.P.Val<=fdr_qThresh)))
}
}
cat(report2screen)
## pfc DE genes
## interaction DE genes is 2629
## group DE genes is 1
## sex DE genes is 2311
# plot percentage of DE genes on each chromosome ------------------------------
gene_meta_data_sub = subset(gene_meta_data,
is.element(gene_meta_data$ensembl_gene_id,
rownames(exprs_filt_pfc)))
# DE-
tmp_df = de_res_pfc[["interaction_table"]]
tmp_df = subset(tmp_df, tmp_df$adj.P.Val<fdr_qThresh & tmp_df$logFC>0)
chr_res = chromosome_perm(gene_meta_data_sub,
de_df = tmp_df,
nperm=10000,
fdr_qThresh=fdr_qThresh)
chr_res
## Chromosome percent pval fdr
## 1 1 0.16462585 0.00049995 0.00199980
## 10 10 0.13220339 0.22497750 0.48355164
## 11 11 0.08659794 0.99970003 1.00000000
## 12 12 0.13465784 0.20397960 0.48355164
## 13 13 0.18461538 0.00009999 0.00049995
## 14 14 0.15853659 0.01469853 0.04899510
## 15 15 0.07662083 0.99950005 1.00000000
## 16 16 0.13589744 0.19948005 0.48355164
## 17 17 0.08818342 0.99520048 1.00000000
## 18 18 0.21153846 0.00009999 0.00049995
## 19 19 0.08857809 0.98880112 1.00000000
## 2 2 0.11758794 0.66523348 1.00000000
## 3 3 0.17815126 0.00009999 0.00049995
## 4 4 0.10039630 0.97220278 1.00000000
## 5 5 0.10532838 0.93480652 1.00000000
## 6 6 0.13114754 0.24177582 0.48355164
## 7 7 0.07464325 1.00000000 1.00000000
## 8 8 0.08137715 0.99980002 1.00000000
## 9 9 0.10780142 0.88661134 1.00000000
## X X 0.20095694 0.00009999 0.00049995
fontSize=30
p = ggplot(data = chr_res, aes(x = reorder(Chromosome,percent), y = percent, fill=-log10(pval))) +
geom_bar(stat="identity") +
scale_fill_gradientn(colors = colorRampPalette(c("grey60","#ff8d1e"))(100),
limits = c(min(-log10(chr_res$pval)),max(-log10(chr_res$pval)))) +
geom_hline(yintercept = min(chr_res[chr_res$fdr<=fdr_qThresh,"percent"])-0.01, linetype="dashed") +
xlab("Chromosome") + ylab("Percentage DE genes") +
coord_flip() +
theme(
axis.text.x = element_text(size=fontSize+10),
axis.text.y = element_text(size=fontSize+10),
axis.title.x = element_text(size=fontSize+10),
strip.text.x = element_text(size=fontSize+10),
axis.title.y = element_text(size=fontSize+10))
ggsave(filename = file.path(plot_path, "DE-.interaction.pfc.chromosomePlot.pdf"),
width=18, height=12)
p

# DE+
tmp_df = de_res_pfc[["interaction_table"]]
tmp_df = subset(tmp_df, tmp_df$adj.P.Val<fdr_qThresh & tmp_df$logFC<0)
chr_res = chromosome_perm(gene_meta_data_sub,
de_df = tmp_df,
nperm=10000,
fdr_qThresh=fdr_qThresh)
chr_res
## Chromosome percent pval fdr
## 1 1 0.06394558 0.99820018 1.00000000
## 10 10 0.08474576 0.75882412 1.00000000
## 11 11 0.12371134 0.00039996 0.00399960
## 12 12 0.04415011 0.99990001 1.00000000
## 13 13 0.04175824 1.00000000 1.00000000
## 14 14 0.08048780 0.81831817 1.00000000
## 15 15 0.12573674 0.00659934 0.02199780
## 16 16 0.07435897 0.91040896 1.00000000
## 17 17 0.13051146 0.00169983 0.00849915
## 18 18 0.13461538 0.00399960 0.01599840
## 19 19 0.13752914 0.00079992 0.00533280
## 2 2 0.10351759 0.11088891 0.27722228
## 3 3 0.03529412 1.00000000 1.00000000
## 4 4 0.09247028 0.51014899 1.00000000
## 5 5 0.10904585 0.04979502 0.14227149
## 6 6 0.06721311 0.98920108 1.00000000
## 7 7 0.13721186 0.00009999 0.00199980
## 8 8 0.09546166 0.40415958 0.89813241
## 9 9 0.06950355 0.98760124 1.00000000
## X X 0.03110048 1.00000000 1.00000000
fontSize=30
p = ggplot(data = chr_res, aes(x = reorder(Chromosome,percent), y = percent, fill=-log10(pval))) +
geom_bar(stat="identity") +
scale_fill_gradientn(colors = colorRampPalette(c("grey60","#1e90ff"))(100),
limits = c(min(-log10(chr_res$pval)),max(-log10(chr_res$pval)))) +
geom_hline(yintercept = min(chr_res[chr_res$fdr<=fdr_qThresh,"percent"])-0.01, linetype="dashed") +
xlab("Chromosome") + ylab("Percentage DE genes") +
coord_flip() +
theme(
axis.text.x = element_text(size=fontSize+10),
axis.text.y = element_text(size=fontSize+10),
axis.title.x = element_text(size=fontSize+10),
strip.text.x = element_text(size=fontSize+10),
axis.title.y = element_text(size=fontSize+10))
ggsave(filename = file.path(plot_path, "DE+.interaction.pfc.chromosomePlot.pdf"),width=18, height=12)
p

# make a volcano plot
# interaction volcano plot
data2use = de_res_pfc[["interaction_table"]]
fname2save = file.path(plot_path, sprintf("DE.pfc.interaction.volcanoPlot%s.pdf",fstem))
colors2use = c("#ff8d1e","grey60","#1e90ff")
p = volcanoPlot(data2use = data2use,
fname2save = fname2save,
fdr_qThresh = fdr_qThresh,
plotTopGenes=FALSE,
colors2use=colors2use,
fontSize=40,
dotSize=1,
plotWidth=10,
plotHeight=8)
p

# sex volcano plot
data2use = de_res_pfc[["sex_table"]]
fname2save = file.path(plot_path, sprintf("DE.pfc.sex.volcanoPlot%s.pdf",fstem))
colors2use = c("#ff8d1e","grey60","#1e90ff")
p = volcanoPlot(data2use = data2use,
fname2save = fname2save,
fdr_qThresh = fdr_qThresh,
plotTopGenes=TRUE,
colors2use=colors2use,
fontSize=40,
dotSize=1,
plotWidth=10,
plotHeight=8)
p

# group volcano plot
data2use = de_res_pfc[["group_table"]]
fname2save = file.path(plot_path, sprintf("DE.pfc.group.volcanoPlot%s.pdf",fstem))
colors2use = c("grey60","#1e90ff")
p = volcanoPlot(data2use = data2use,
fname2save = fname2save,
fdr_qThresh = fdr_qThresh,
plotTopGenes=TRUE,
colors2use=colors2use,
fontSize=40,
dotSize=1,
plotWidth=10,
plotHeight=8)
p

HYT DE modeling
de_res_hyt = runDEmodel(expr_data = v_hyt, meta_data = meta_data_hyt,
form2use = form4model_hyt,
gene_meta_data = gene_meta_data)
# save tables
brain_region = "hyt"
fstem = sprintf("_filt%0.2f",varianceFilterCutoff)
table_names2use = c("Fstat","interaction","group","sex")
report2screen = sprintf("%s DE genes\n",brain_region)
for (itable in 1:length(table_names2use)){
fname2save = file.path(res_path,
brain_region,
sprintf("DE.%s_table%s.csv",
table_names2use[itable],
fstem))
tab2use = de_res_hyt[[sprintf("%s_table",table_names2use[itable])]]
tab2use = tab2use[order(tab2use$adj.P.Val),]
tab2use$DE = "NO"
tab2use$DE[tab2use$adj.P.Val<=fdr_qThresh] = "YES"
write.csv(tab2use, file = fname2save)
if (itable>1){
report2screen = c(report2screen,
sprintf("%s DE genes is %d\n",table_names2use[itable],
sum(tab2use$adj.P.Val<=fdr_qThresh)))
}
}
cat(report2screen)
## hyt DE genes
## interaction DE genes is 0
## group DE genes is 1
## sex DE genes is 723
# make a volcano plot
# interaction volcano plot
data2use = de_res_hyt[["interaction_table"]]
fname2save = file.path(plot_path, sprintf("DE.hyt.interaction.volcanoPlot%s.pdf",fstem))
colors2use = "grey60"
p = volcanoPlot(data2use = data2use,
fname2save = fname2save,
fdr_qThresh = fdr_qThresh,
plotTopGenes=FALSE,
colors2use=colors2use,
fontSize=40,
dotSize=1,
plotWidth=10,
plotHeight=8)
p

# sex volcano plot
data2use = de_res_hyt[["sex_table"]]
fname2save = file.path(plot_path, sprintf("DE.hyt.sex.volcanoPlot%s.pdf",fstem))
colors2use = c("#ff8d1e","grey60","#1e90ff")
p = volcanoPlot(data2use = data2use,
fname2save = fname2save,
fdr_qThresh = fdr_qThresh,
plotTopGenes=TRUE,
colors2use=colors2use,
fontSize=40,
dotSize=1,
plotWidth=10,
plotHeight=8)
p

# group volcano plot
data2use = de_res_hyt[["group_table"]]
fname2save = file.path(plot_path, sprintf("DE.hyt.group.volcanoPlot%s.pdf",fstem))
colors2use = c("grey60","#1e90ff")
p = volcanoPlot(data2use = data2use,
fname2save = fname2save,
fdr_qThresh = fdr_qThresh,
plotTopGenes=TRUE,
colors2use=colors2use,
fontSize=40,
dotSize=1,
plotWidth=10,
plotHeight=8)
p

ASD-related enrichment analyses with DE genes
# pre-allocate array with gene lists to use
gene_lists2use = c("SFARI", "ptLGD","dup15q DE+", "dup15q DE-",
"AS iPSC","dup15q iPSC",
"ASD DE+","ASD DE-","SCZ DE","BD DE")
cols2use = c("genelist","OR","pval","fdr")
gsea_res = data.frame(matrix(nrow = length(gene_lists2use), ncol = length(cols2use)))
rownames(gsea_res) = gene_lists2use
colnames(gsea_res) = cols2use
# get DE genes
mask = de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh
de_genes = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name,de_genes)
de_genes_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name,de_genes)],incomparables = "")
de_genes_human_homologs = de_genes_human_homologs[2:length(de_genes_human_homologs)]
# get Pos logFC DE genes - i.e. DE-
mask = (de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC>0)
pos_de_genes = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name, pos_de_genes)
pos_de_genes_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name, pos_de_genes)],incomparables = "")
pos_de_genes_human_homologs = pos_de_genes_human_homologs[2:length(pos_de_genes_human_homologs)]
# get Neg logFC DE genes - i.e. DE+
mask = (de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC<0)
neg_de_genes = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name, neg_de_genes)
neg_de_genes_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name, neg_de_genes)],incomparables = "")
neg_de_genes_human_homologs = neg_de_genes_human_homologs[2:length(neg_de_genes_human_homologs)]
# get background list human homologs
mask = is.element(gene_meta_data$ensembl_gene_id, rownames(exprs_filt_pfc))
bglist = gene_meta_data$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name,bglist)
bglist_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name,bglist)],incomparables = "")
mask = bglist_human_homologs==""
bglist_human_homologs = bglist_human_homologs[!mask]
bg_num = length(bglist)
# load SFARI genes
sfari_genes = read.csv(file.path(data_path,"SFARI-Gene_genes_01-13-2021release_01-20-2021export.csv"))
sfari_genes_all = sfari_genes$gene.symbol
sfari_genes_all = sfari_genes_all[is.element(sfari_genes_all, bglist_human_homologs)]
# load private transmitted LGD genes (Wilfert et al., 2021), doesn't overlap with known DNM and SFARI genes
ptlgd_genes = read.csv(file.path(data_path,"wilfert2021_supptab17_privateInheritedLGD.csv"))
ptlgd_genes_all = ptlgd_genes$Gene
ptlgd_genes_all = ptlgd_genes_all[is.element(ptlgd_genes_all, bglist_human_homologs)]
# load Parikshak 2016 for Dup15q DE genes
parikshak2016 = read.csv(file.path(data_path,"parikshak2016_degenes.csv"))
mask = parikshak2016$FDR.adjusted.P.value..dup15q.vs.CTL<=0.05
de_dup15q_genes = parikshak2016$HGNC.Symbol[mask]
mask = de_dup15q_genes==""
de_dup15q_genes = de_dup15q_genes[!mask]
de_dup15q_genes = de_dup15q_genes[is.element(de_dup15q_genes, bglist_human_homologs)]
# upregulated in dup15q
mask = (parikshak2016$FDR.adjusted.P.value..dup15q.vs.CTL<=0.05) & (parikshak2016$log2.FC..dup15q.vs.CTL>0)
de_dup15q_genes = parikshak2016$HGNC.Symbol[mask]
mask = de_dup15q_genes==""
de_dup15q_UP_genes = de_dup15q_genes[!mask]
de_dup15q_UP_genes = de_dup15q_UP_genes[is.element(de_dup15q_UP_genes, bglist_human_homologs)]
# downregulated in dup15q
mask = (parikshak2016$FDR.adjusted.P.value..dup15q.vs.CTL<=0.05) & (parikshak2016$log2.FC..dup15q.vs.CTL<0)
de_dup15q_genes = parikshak2016$HGNC.Symbol[mask]
mask = de_dup15q_genes==""
de_dup15q_DOWN_genes = de_dup15q_genes[!mask]
de_dup15q_DOWN_genes = de_dup15q_DOWN_genes[is.element(de_dup15q_DOWN_genes, bglist_human_homologs)]
# load Germain 2014 for AS and dup15q iPSC DE genes
germain2014 = read.csv(file.path(data_path,"germain_2014_15qCNV_iPSC_rnaseq_data.csv"))
mask = germain2014$FoldChange_AS>=2 | germain2014$FoldChange_AS<=0.5
as_ipsc_deall_genes = unique(germain2014$gene_name[mask])
mask = as_ipsc_deall_genes==""
as_ipsc_deall_genes = as_ipsc_deall_genes[!mask]
as_ipsc_deall_genes = as_ipsc_deall_genes[is.element(as_ipsc_deall_genes, bglist_human_homologs)]
mask = germain2014$FoldChange_dupl>=2 | germain2014$FoldChange_dupl<=0.5
dup15q_ipsc_deall_genes = unique(germain2014$gene_name[mask])
mask = dup15q_ipsc_deall_genes==""
dup15q_ipsc_deall_genes = dup15q_ipsc_deall_genes[!mask]
dup15q_ipsc_deall_genes = dup15q_ipsc_deall_genes[is.element(dup15q_ipsc_deall_genes, bglist_human_homologs)]
# load Gandal 2018 for ASD, SCZ, and BD DE genes
# ASD
gandal2018 = read.csv(file.path(data_path,"gandal2018_degenes.csv"))
mask = gandal2018$ASD.fdr<=0.05
de_asd_genes = gandal2018$gene_name[mask]
mask = de_asd_genes==""
de_asd_genes = de_asd_genes[!mask]
de_asd_genes = de_asd_genes[is.element(de_asd_genes, bglist_human_homologs)]
# upregulated
mask = (gandal2018$ASD.fdr<=0.05) & (gandal2018$ASD.log2FC>0)
de_asd_genes = gandal2018$gene_name[mask]
mask = de_asd_genes==""
de_asd_UP_genes = de_asd_genes[!mask]
de_asd_UP_genes = de_asd_UP_genes[is.element(de_asd_UP_genes, bglist_human_homologs)]
# downregulated
mask = (gandal2018$ASD.fdr<=0.05) & (gandal2018$ASD.log2FC<0)
de_asd_genes = gandal2018$gene_name[mask]
mask = de_asd_genes==""
de_asd_DOWN_genes = de_asd_genes[!mask]
de_asd_DOWN_genes = de_asd_DOWN_genes[is.element(de_asd_DOWN_genes, bglist_human_homologs)]
# Schizophrenia
mask = gandal2018$SCZ.fdr<=0.05
de_scz_genes = gandal2018$gene_name[mask]
mask = de_scz_genes==""
de_scz_genes = de_scz_genes[!mask]
de_scz_genes = de_scz_genes[is.element(de_scz_genes, bglist_human_homologs)]
# Bipolar
mask = gandal2018$BD.fdr<=0.05
de_bd_genes = gandal2018$gene_name[mask]
mask = de_bd_genes==""
de_bd_genes = de_bd_genes[!mask]
de_bd_genes = de_bd_genes[is.element(de_bd_genes, bglist_human_homologs)]
gene_lists4loop = list(sfari_genes_all,
ptlgd_genes_all,
de_dup15q_UP_genes, de_dup15q_DOWN_genes,
as_ipsc_deall_genes, dup15q_ipsc_deall_genes,
de_asd_UP_genes, de_asd_DOWN_genes,
de_scz_genes,
de_bd_genes)
# All DE genes
for (ilist in 1:length(gene_lists4loop)){
list1 = de_genes_human_homologs
list2 = gene_lists4loop[[ilist]]
list2 = list2[is.element(list2,bglist_human_homologs)]
backgroundTotal = bg_num
overlap_res = genelistOverlap(list1 = list1, list2 = list2,
backgroundTotal = bg_num, print_result = FALSE)
if (ilist==1){
sfari_overlap = overlap_res[[1]]$overlapping_genes
sfari_genes_de_overlap = sfari_genes[is.element(sfari_genes$gene.symbol,sfari_overlap),]
write.csv(sfari_genes_de_overlap, file = file.path(res_path, "pfc", "sfari_de_overlap.csv"))
print(gene_lists2use[ilist])
print(overlap_res[[1]]$percent_overlap_list2)
print(length(overlap_res[[1]]$overlapping_genes))
print(overlap_res[[1]]$overlapping_genes)
} else if (ilist==2){
ptlgd_overlap = overlap_res[[1]]$overlapping_genes
ptlgd_genes_de_overlap = ptlgd_genes[is.element(ptlgd_genes$Gene,ptlgd_overlap),]
write.csv(ptlgd_genes_de_overlap, file = file.path(res_path, "pfc","ptlgd_de_overlap.csv"))
print(gene_lists2use[ilist])
print(overlap_res[[1]]$percent_overlap_list2)
print(length(overlap_res[[1]]$overlapping_genes))
print(overlap_res[[1]]$overlapping_genes)
}
gsea_res[gene_lists2use[ilist],"genelist"] = gene_lists2use[ilist]
gsea_res[gene_lists2use[ilist],"OR"] = overlap_res[[1]]$OR
gsea_res[gene_lists2use[ilist],"pval"] = overlap_res[[1]]$hypergeo_p
}
## [1] "SFARI"
## [1] 0.2530864
## [1] 205
## [1] "ABAT" "ACTN4" "ADCY5" "ADSS2" "AGAP1" "AGAP2"
## [7] "AGBL4" "ANKRD11" "ARHGEF10" "ARNT2" "ATRX" "BCKDK"
## [13] "BRAF" "BRSK2" "BRWD3" "CACNA2D1" "CADM2" "CAMK2A"
## [19] "CAMK4" "CAPRIN1" "CCSER1" "CDC42BPB" "CDH8" "CDKL5"
## [25] "CHD3" "CHM" "CIC" "CLCN4" "CNTN3" "CNTN4"
## [31] "CPEB4" "CSDE1" "CSNK1E" "CSNK1G1" "CSNK2A1" "CTNND2"
## [37] "CUL3" "CUX2" "CX3CR1" "DDHD2" "DDX3X" "DEAF1"
## [43] "DLG4" "DLGAP2" "DLGAP3" "DMD" "DMWD" "DOCK1"
## [49] "DOCK8" "DPP10" "DPP4" "EIF4E" "ELAVL2" "EP400"
## [55] "ERG" "EXOC5" "EXOC6" "FBRSL1" "FBXO11" "FBXO33"
## [61] "FGF14" "FMR1" "FRMPD4" "GABRA4" "GABRB2" "GABRB3"
## [67] "GALNT13" "GNAI1" "GPR85" "GRIA2" "GRID1" "GRIK5"
## [73] "GRIN2A" "GRM5" "GSTM1" "HCN1" "HDAC8" "HEPACAM"
## [79] "HNRNPH2" "HNRNPU" "HRAS" "IL1RAPL1" "ILF2" "INTS6"
## [85] "IQSEC2" "KANK1" "KCND2" "KCNJ10" "KCNMA1" "KCTD13"
## [91] "KRR1" "LNPK" "LRP1" "LZTS2" "MAP1A" "MAPK3"
## [97] "MBD3" "MED13" "MEF2C" "MEMO1" "MET" "METTL26"
## [103] "MIB1" "MNT" "NAA15" "NACC1" "NCKAP1" "NCOR1"
## [109] "NDUFA5" "NEGR1" "NEXMIF" "NIPA1" "NIPA2" "NPAS2"
## [115] "NSD2" "NUDCD2" "PACS2" "PAFAH1B2" "PCDH11X" "PCDHA1"
## [121] "PCDHA11" "PCDHA4" "PCDHA13" "PCDHA2" "PCDHA3" "PCDHA5"
## [127] "PCDHA8" "PCDHA6" "PCDHA9" "PCDHA7" "PCDHA10" "PCDHAC1"
## [133] "PCM1" "PDK2" "PHF2" "PHIP" "PHRF1" "PIK3R2"
## [139] "PLCB1" "PLXNB1" "POMGNT1" "PPP1R9B" "PPP2R5D" "PREX1"
## [145] "PRODH" "PRPF39" "PRR12" "PSMD12" "PTBP2" "PTEN"
## [151] "PTPRB" "RAB39B" "RALA" "RBM27" "REEP3" "RHEB"
## [157] "RIMS3" "RLIM" "ROBO2" "RPS6KA3" "SAE1" "SASH1"
## [163] "SBF1" "SCN1A" "SCN2A" "SET" "SF3B1" "SGSH"
## [169] "SHANK1" "SHANK3" "SKI" "SLC1A1" "SLC1A2" "SLC25A39"
## [175] "SLC4A10" "SLC6A1" "SLC7A5" "SMARCC2" "SNX14" "SOX5"
## [181] "SPARCL1" "SYNCRIP" "TAF6" "TAOK1" "TCF4" "TFB2M"
## [187] "THRA" "TMEM134" "TNS2" "TRAPPC9" "TRIM23" "TRIM33"
## [193] "TRIP12" "TSC2" "TSPAN7" "UBR1" "UPF2" "USP15"
## [199] "USP45" "WDFY4" "WDR26" "XPO1" "YTHDC2" "ZC3H4"
## [205] "ZMYM2"
## [1] "ptLGD"
## [1] 0.3377483
## [1] 51
## [1] "AGPS" "APPBP2" "ARHGAP31" "ATP2C1" "BRPF3" "CAD"
## [7] "CASKIN1" "CCAR1" "CCNJ" "CEPT1" "CIC" "COPB2"
## [13] "CUL2" "DAB2IP" "DLG5" "DYNC1LI1" "EYA1" "FADS2"
## [19] "FAM169A" "FXR1" "GMPS" "GSK3A" "IPO13" "KLHL2"
## [25] "LRP1" "MMP14" "MORC3" "NCAPG2" "NOL6" "NPAS1"
## [31] "PHLPP1" "PLCG2" "PODXL2" "PPFIA3" "PRRC2A" "PSME4"
## [37] "RABL6" "SCAF1" "SCAI" "SENP5" "SIPA1L1" "SMC2"
## [43] "SPTB" "TLN1" "TMEM201" "TNRC18" "TRAPPC8" "TP53BP1"
## [49] "ZNF423" "ZNF638" "ZYG11B"
gsea_res[,"fdr"] = p.adjust(gsea_res[,"pval"], method = "fdr")
gsea_res
## genelist OR pval fdr
## SFARI SFARI 1.575795 1.394132e-02 2.788264e-02
## ptLGD ptLGD 2.331456 6.209919e-04 2.069973e-03
## dup15q DE+ dup15q DE+ 1.028066 8.465379e-01 9.086679e-01
## dup15q DE- dup15q DE- 2.390782 1.900680e-07 9.503399e-07
## AS iPSC AS iPSC 1.212307 9.086679e-01 9.086679e-01
## dup15q iPSC dup15q iPSC 1.789369 8.953920e-08 8.953920e-07
## ASD DE+ ASD DE+ 1.286554 5.413257e-01 8.772497e-01
## ASD DE- ASD DE- 1.630936 9.825426e-03 2.456356e-02
## SCZ DE SCZ DE 1.345796 6.140748e-01 8.772497e-01
## BD DE BD DE 1.146636 8.928758e-01 9.086679e-01
# p = ggplot(data = gsea_res, aes(x = reorder(genelist, OR), y = -log10(pval), fill = OR)) +
p = ggplot(data = gsea_res, aes(x = reorder(genelist, -log10(pval)), y = -log10(pval), fill = OR)) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.03), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(1,max(gsea_res$OR))) +
coord_flip()
ggsave(filename = file.path(plot_path, sprintf("DE.interaction.pfc.humanEnrichments%s.pdf",fstem)))
p

# Pos logFC - i.e. DE-
for (ilist in 1:length(gene_lists4loop)){
list1 = pos_de_genes_human_homologs
list2 = gene_lists4loop[[ilist]]
list2 = list2[is.element(list2,bglist_human_homologs)]
backgroundTotal = bg_num
overlap_res = genelistOverlap(list1 = list1, list2 = list2,
backgroundTotal = bg_num, print_result = FALSE)
if (ilist==1){
print(gene_lists2use[ilist])
print(overlap_res[[1]]$percent_overlap_list2)
print(length(overlap_res[[1]]$overlapping_genes))
print(overlap_res[[1]]$overlapping_genes)
}
gsea_res[gene_lists2use[ilist],"genelist"] = gene_lists2use[ilist]
gsea_res[gene_lists2use[ilist],"OR"] = overlap_res[[1]]$OR
gsea_res[gene_lists2use[ilist],"pval"] = overlap_res[[1]]$hypergeo_p
}
## [1] "SFARI"
## [1] 0.1506173
## [1] 122
## [1] "ADSS2" "AGBL4" "ATRX" "BRAF" "BRWD3" "CACNA2D1"
## [7] "CADM2" "CAMK4" "CAPRIN1" "CCSER1" "CDH8" "CDKL5"
## [13] "CHM" "CLCN4" "CNTN3" "CNTN4" "CPEB4" "CSDE1"
## [19] "CSNK1G1" "CSNK2A1" "CUL3" "DDHD2" "DDX3X" "DMD"
## [25] "DPP10" "DPP4" "EIF4E" "ELAVL2" "EXOC5" "EXOC6"
## [31] "FBXO11" "FBXO33" "FGF14" "FMR1" "FRMPD4" "GABRA4"
## [37] "GABRB2" "GABRB3" "GALNT13" "GNAI1" "GPR85" "GRIA2"
## [43] "GRIN2A" "GRM5" "HCN1" "HDAC8" "HNRNPH2" "HNRNPU"
## [49] "IL1RAPL1" "ILF2" "INTS6" "KCND2" "KCNMA1" "KRR1"
## [55] "LNPK" "MED13" "MEF2C" "MEMO1" "MET" "MIB1"
## [61] "NAA15" "NCKAP1" "NDUFA5" "NEGR1" "NEXMIF" "NIPA1"
## [67] "NIPA2" "NSD2" "NUDCD2" "PAFAH1B2" "PCDH11X" "PCDHA1"
## [73] "PCDHA11" "PCDHA4" "PCDHA13" "PCDHA2" "PCDHA3" "PCDHA5"
## [79] "PCDHA8" "PCDHA6" "PCDHA9" "PCDHA7" "PCDHA10" "PCDHAC1"
## [85] "PCM1" "PHIP" "PLCB1" "PRPF39" "PSMD12" "PTBP2"
## [91] "PTEN" "RAB39B" "RALA" "RBM27" "RHEB" "RLIM"
## [97] "ROBO2" "RPS6KA3" "SAE1" "SCN1A" "SCN2A" "SET"
## [103] "SF3B1" "SLC1A1" "SLC4A10" "SNX14" "SOX5" "SYNCRIP"
## [109] "TAOK1" "TCF4" "TFB2M" "TRIM23" "TRIM33" "TRIP12"
## [115] "UBR1" "UPF2" "USP15" "USP45" "WDR26" "XPO1"
## [121] "YTHDC2" "ZMYM2"
gsea_res[,"fdr"] = p.adjust(gsea_res[,"pval"], method = "fdr")
poslogfc_res = gsea_res
gsea_res
## genelist OR pval fdr
## SFARI SFARI 1.4438127 1.769705e-02 4.424262e-02
## ptLGD ptLGD 1.4340875 1.918718e-01 3.837435e-01
## dup15q DE+ dup15q DE+ 0.4652926 9.952573e-01 9.999477e-01
## dup15q DE- dup15q DE- 3.7788889 4.149096e-20 2.074548e-19
## AS iPSC AS iPSC 1.2082040 3.372211e-01 5.620351e-01
## dup15q iPSC dup15q iPSC 2.6577975 6.195095e-46 6.195095e-45
## ASD DE+ ASD DE+ 0.6071054 9.999477e-01 9.999477e-01
## ASD DE- ASD DE- 2.1676101 4.752783e-09 1.584261e-08
## SCZ DE SCZ DE 0.9348247 9.997996e-01 9.999477e-01
## BD DE BD DE 0.8422789 9.903868e-01 9.999477e-01
# p = ggplot(data = gsea_res, aes(x = reorder(genelist, OR), y = -log10(pval), fill = OR)) +
p = ggplot(data = gsea_res, aes(x = reorder(genelist, -log10(pval)), y = -log10(pval), fill = OR)) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.03), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(1,max(gsea_res$OR))) +
coord_flip()
ggsave(filename = file.path(plot_path, sprintf("DE.interaction.pfc.PosLogFC.humanEnrichments%s.pdf",fstem)))
p

# Neg logFC - i.e. DE+
for (ilist in 1:length(gene_lists4loop)){
list1 = neg_de_genes_human_homologs
list2 = gene_lists4loop[[ilist]]
list2 = list2[is.element(list2,bglist_human_homologs)]
backgroundTotal = bg_num
overlap_res = genelistOverlap(list1 = list1, list2 = list2,
backgroundTotal = bg_num, print_result = FALSE)
gsea_res[gene_lists2use[ilist],"genelist"] = gene_lists2use[ilist]
gsea_res[gene_lists2use[ilist],"OR"] = overlap_res[[1]]$OR
gsea_res[gene_lists2use[ilist],"pval"] = overlap_res[[1]]$hypergeo_p
}
gsea_res[,"fdr"] = p.adjust(gsea_res[,"pval"], method = "fdr")
neglogfc_res = gsea_res
gsea_res
## genelist OR pval fdr
## SFARI SFARI 1.2148546 0.2486948001 0.414491334
## ptLGD ptLGD 2.4260617 0.0004280837 0.001426946
## dup15q DE+ dup15q DE+ 1.5718039 0.1447685543 0.361921386
## dup15q DE- dup15q DE- 0.3087027 0.9999996956 1.000000000
## AS iPSC AS iPSC 0.8838114 0.9924895394 1.000000000
## dup15q iPSC dup15q iPSC 0.5794839 1.0000000000 1.000000000
## ASD DE+ ASD DE+ 1.8783869 0.0001400588 0.001400588
## ASD DE- ASD DE- 0.5851405 0.9999573543 1.000000000
## SCZ DE SCZ DE 1.4541750 0.0002836024 0.001418012
## BD DE BD DE 1.2444558 0.2228411638 0.414491334
# p = ggplot(data = gsea_res, aes(x = reorder(genelist, OR), y = -log10(pval), fill = OR)) +
p = ggplot(data = gsea_res, aes(x = reorder(genelist, -log10(pval)), y = -log10(pval), fill = OR)) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.03), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(1,max(gsea_res$OR))) +
coord_flip()
ggsave(filename = file.path(plot_path, sprintf("DE.interaction.pfc.NegLogFC.humanEnrichments%s.pdf",fstem)))
p

Sex hormone-related enrichment analyses with DE genes
# pre-allocate array with gene lists to use
gene_lists2use = c("DHT DE+","DHT DE-","EST DE+","EST DE-",
"AR Targets","ER Targets","Male DT","Female DT","SexDiv DT")
cols2use = c("genelist","OR","pval","fdr")
gsea_res = data.frame(matrix(nrow = length(gene_lists2use), ncol = length(cols2use)))
rownames(gsea_res) = gene_lists2use
colnames(gsea_res) = cols2use
# get DE genes
mask = de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh
de_genes = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name,de_genes)
de_genes_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name,de_genes)],incomparables = "")
de_genes_human_homologs = de_genes_human_homologs[2:length(de_genes_human_homologs)]
# get Pos logFC DE genes - i.e. DE-
mask = (de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC>0)
pos_de_genes = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name, pos_de_genes)
pos_de_genes_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name, pos_de_genes)],incomparables = "")
pos_de_genes_human_homologs = pos_de_genes_human_homologs[2:length(pos_de_genes_human_homologs)]
# get Neg logFC DE genes - i.e. DE+
mask = (de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC<0)
neg_de_genes = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name, neg_de_genes)
neg_de_genes_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name, neg_de_genes)],incomparables = "")
neg_de_genes_human_homologs = neg_de_genes_human_homologs[2:length(neg_de_genes_human_homologs)]
# get background list human homologs
mask = is.element(gene_meta_data$ensembl_gene_id, rownames(exprs_filt_pfc))
bglist = gene_meta_data$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name,bglist)
bglist_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[is.element(hsap_homolog_gene_meta_data$external_gene_name,bglist)],incomparables = "")
mask = bglist_human_homologs==""
bglist_human_homologs = bglist_human_homologs[!mask]
bg_num = length(bglist)
# load DHT genes
dht_genes = read.csv(file.path(data_path,"lombardo_2020_degenes_DHT_hNSC.csv"))
dht_genes_all = dht_genes$GeneSymbol
dht_genes_all = dht_genes_all[is.element(dht_genes_all, bglist_human_homologs)]
# upregulated
dht_genes_UP = dht_genes$GeneSymbol[dht_genes$direction=="upreg"]
dht_genes_UP = dht_genes_UP[is.element(dht_genes_UP, bglist_human_homologs)]
# downregulated
dht_genes_DOWN = dht_genes$GeneSymbol[dht_genes$direction=="downreg"]
dht_genes_DOWN = dht_genes_DOWN[is.element(dht_genes_DOWN, bglist_human_homologs)]
# load EST genes
est_genes = read.csv(file.path(data_path,"csoregh_2009_EST_DE.csv"))
est_genes_all = est_genes$GeneSymbol
est_genes_all = unique(est_genes_all[!is.na(est_genes_all)])
est_genes_all = est_genes_all[is.element(est_genes_all, bglist_human_homologs)]
# upregulated
est_genes_UP = est_genes$GeneSymbol[est_genes$direction=="up"]
est_genes_UP = unique(est_genes_UP[!is.na(est_genes_UP)])
est_genes_UP = est_genes_UP[is.element(est_genes_UP, bglist_human_homologs)]
# dowrnegulated
est_genes_DOWN = est_genes$GeneSymbol[est_genes$direction=="down"]
est_genes_DOWN = unique(est_genes_DOWN[!is.na(est_genes_DOWN)])
est_genes_DOWN = est_genes_UP[is.element(est_genes_DOWN, bglist_human_homologs)]
# load AR-target genes
artarg_genes = read.csv(file.path(data_path,"quartier_2019_ARtargetGenes_ChIPSeq_hNSC.csv"))
artarg_genes_all = artarg_genes$GeneSymbol
artarg_genes_all = artarg_genes_all[is.element(artarg_genes_all, bglist_human_homologs)]
# load ER-alpha target genes
ertarg_genes = read_excel(file.path(data_path,"gegenhuber_Data_S1_ERa_peaks_final.xlsx"), sheet = "Brain-specific peaks")
ertarg_genes = data.frame(ertarg_genes)
ertarg_genes_all = as.character(ertarg_genes[,"Gene.annotation"])
mask = is.element(hsap_homolog_gene_meta_data$external_gene_name, ertarg_genes_all)
ertarg_genes_human_homologs = unique(hsap_homolog_gene_meta_data$hsapiens_homolog_associated_gene_name[mask],incomparables = "")
ertarg_genes_all = ertarg_genes_human_homologs[!ertarg_genes_human_homologs==""]
ertarg_genes_all = ertarg_genes_all[is.element(ertarg_genes_all, bglist_human_homologs)]
# load sex differentially targeted genes
male_dt_genes = read.csv(file.path(data_path,"lopes-ramos_2020_DTgenes.csv"))
male_dt_genes = male_dt_genes$GeneSymbol[male_dt_genes$DTclass=="Male-biased"]
male_dt_genes = male_dt_genes[is.element(male_dt_genes, bglist_human_homologs)]
female_dt_genes = read.csv(file.path(data_path,"lopes-ramos_2020_DTgenes.csv"))
female_dt_genes = female_dt_genes$GeneSymbol[female_dt_genes$DTclass=="Female-biased"]
female_dt_genes = female_dt_genes[is.element(female_dt_genes, bglist_human_homologs)]
sexdiv_dt_genes = read.csv(file.path(data_path,"lopes-ramos_2020_DTgenes.csv"))
sexdiv_dt_genes = sexdiv_dt_genes$GeneSymbol[sexdiv_dt_genes$DTclass=="Sex-divergent"]
sexdiv_dt_genes = sexdiv_dt_genes[is.element(sexdiv_dt_genes, bglist_human_homologs)]
gene_lists4loop = list(dht_genes_UP, dht_genes_DOWN,
est_genes_UP, est_genes_DOWN,
artarg_genes_all,
ertarg_genes_all,
male_dt_genes,
female_dt_genes,
sexdiv_dt_genes)
brain_region = "pfc"
# Pos logFC - i.e. DE-
for (ilist in 1:length(gene_lists4loop)){
list1 = pos_de_genes_human_homologs
list2 = gene_lists4loop[[ilist]]
list2 = list2[is.element(list2,bglist_human_homologs)]
backgroundTotal = bg_num
overlap_res = genelistOverlap(list1 = list1, list2 = list2,
backgroundTotal = bg_num, print_result = FALSE)
gsea_res[gene_lists2use[ilist],"genelist"] = gene_lists2use[ilist]
gsea_res[gene_lists2use[ilist],"OR"] = overlap_res[[1]]$OR
gsea_res[gene_lists2use[ilist],"pval"] = overlap_res[[1]]$hypergeo_p
}
gsea_res[,"fdr"] = p.adjust(gsea_res[,"pval"], method = "fdr")
poslogfc_sexhormone_res = gsea_res
gsea_res
## genelist OR pval fdr
## DHT DE+ DHT DE+ 1.3394438 1.825064e-01 5.475191e-01
## DHT DE- DHT DE- 0.1891144 9.999942e-01 1.000000e+00
## EST DE+ EST DE+ 1.9920506 1.050971e-01 4.729370e-01
## EST DE- EST DE- 1.3250323 4.762096e-01 8.571773e-01
## AR Targets AR Targets 0.8140267 1.000000e+00 1.000000e+00
## ER Targets ER Targets 1.0918760 6.684622e-01 1.000000e+00
## Male DT Male DT 0.4566315 1.000000e+00 1.000000e+00
## Female DT Female DT 1.9977308 2.596165e-07 2.336549e-06
## SexDiv DT SexDiv DT 1.2341137 3.067365e-01 6.901571e-01
p = ggplot(data = gsea_res, aes(x = reorder(genelist, OR), y = -log10(pval), fill = OR)) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.03), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(1,max(gsea_res$OR))) +
coord_flip()
ggsave(filename = file.path(plot_path, sprintf("DE.interaction.pfc.PosLogFC.SexHormoneEnrichments%s.pdf",fstem)))
p

# Neg logFC - i.e. DE+
for (ilist in 1:length(gene_lists4loop)){
list1 = neg_de_genes_human_homologs
list2 = gene_lists4loop[[ilist]]
list2 = list2[is.element(list2,bglist_human_homologs)]
backgroundTotal = bg_num
overlap_res = genelistOverlap(list1 = list1, list2 = list2,
backgroundTotal = bg_num, print_result = FALSE)
gsea_res[gene_lists2use[ilist],"genelist"] = gene_lists2use[ilist]
gsea_res[gene_lists2use[ilist],"OR"] = overlap_res[[1]]$OR
gsea_res[gene_lists2use[ilist],"pval"] = overlap_res[[1]]$hypergeo_p
}
gsea_res[,"fdr"] = p.adjust(gsea_res[,"pval"], method = "fdr")
neglogfc_sexhormone_res = gsea_res
gsea_res
## genelist OR pval fdr
## DHT DE+ DHT DE+ 1.0106258 7.099297e-01 9.997345e-01
## DHT DE- DHT DE- 2.2034290 4.484512e-03 1.345354e-02
## EST DE+ EST DE+ 0.4878553 9.366615e-01 9.997345e-01
## EST DE- EST DE- 0.3886517 9.393973e-01 9.997345e-01
## AR Targets AR Targets 1.7105866 7.282498e-10 6.554248e-09
## ER Targets ER Targets 0.6071263 9.997345e-01 9.997345e-01
## Male DT Male DT 2.0990700 4.238625e-08 1.907381e-07
## Female DT Female DT 0.6863210 9.991975e-01 9.997345e-01
## SexDiv DT SexDiv DT 1.4805624 1.855500e-02 4.174876e-02
p = ggplot(data = gsea_res, aes(x = reorder(genelist, OR), y = -log10(pval), fill = OR)) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.03), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(1,max(gsea_res$OR))) +
coord_flip()
ggsave(filename = file.path(plot_path, sprintf("DE.interaction.pfc.NegLogFC.SexHormoneEnrichments%s.pdf",fstem)))
p

Plot enrichments as heatmap
poslogfc_res$type = "DE-"
neglogfc_res$type = "DE+"
poslogfc_sexhormone_res$type = "DE-"
neglogfc_sexhormone_res$type = "DE+"
df_gsea = rbind(poslogfc_res, neglogfc_res, poslogfc_sexhormone_res, neglogfc_sexhormone_res)
df_gsea$genelist = factor(df_gsea$genelist,
levels = rev(c("dup15q iPSC",
"dup15q DE-","dup15q DE+",
"AS iPSC",
"SFARI", "ptLGD",
"ASD DE+","ASD DE-",
"SCZ DE","BD DE",
"DHT DE+","DHT DE-",
"EST DE+","EST DE-",
"AR Targets","ER Targets",
"Male DT","Female DT","SexDiv DT")))
df_gsea$type = factor(df_gsea$type, levels = c("DE+", "DE-"))
df_gsea
## genelist OR pval fdr type
## SFARI SFARI 1.4438127 1.769705e-02 4.424262e-02 DE-
## ptLGD ptLGD 1.4340875 1.918718e-01 3.837435e-01 DE-
## dup15q DE+ dup15q DE+ 0.4652926 9.952573e-01 9.999477e-01 DE-
## dup15q DE- dup15q DE- 3.7788889 4.149096e-20 2.074548e-19 DE-
## AS iPSC AS iPSC 1.2082040 3.372211e-01 5.620351e-01 DE-
## dup15q iPSC dup15q iPSC 2.6577975 6.195095e-46 6.195095e-45 DE-
## ASD DE+ ASD DE+ 0.6071054 9.999477e-01 9.999477e-01 DE-
## ASD DE- ASD DE- 2.1676101 4.752783e-09 1.584261e-08 DE-
## SCZ DE SCZ DE 0.9348247 9.997996e-01 9.999477e-01 DE-
## BD DE BD DE 0.8422789 9.903868e-01 9.999477e-01 DE-
## SFARI1 SFARI 1.2148546 2.486948e-01 4.144913e-01 DE+
## ptLGD1 ptLGD 2.4260617 4.280837e-04 1.426946e-03 DE+
## dup15q DE+1 dup15q DE+ 1.5718039 1.447686e-01 3.619214e-01 DE+
## dup15q DE-1 dup15q DE- 0.3087027 9.999997e-01 1.000000e+00 DE+
## AS iPSC1 AS iPSC 0.8838114 9.924895e-01 1.000000e+00 DE+
## dup15q iPSC1 dup15q iPSC 0.5794839 1.000000e+00 1.000000e+00 DE+
## ASD DE+1 ASD DE+ 1.8783869 1.400588e-04 1.400588e-03 DE+
## ASD DE-1 ASD DE- 0.5851405 9.999574e-01 1.000000e+00 DE+
## SCZ DE1 SCZ DE 1.4541750 2.836024e-04 1.418012e-03 DE+
## BD DE1 BD DE 1.2444558 2.228412e-01 4.144913e-01 DE+
## DHT DE+ DHT DE+ 1.3394438 1.825064e-01 5.475191e-01 DE-
## DHT DE- DHT DE- 0.1891144 9.999942e-01 1.000000e+00 DE-
## EST DE+ EST DE+ 1.9920506 1.050971e-01 4.729370e-01 DE-
## EST DE- EST DE- 1.3250323 4.762096e-01 8.571773e-01 DE-
## AR Targets AR Targets 0.8140267 1.000000e+00 1.000000e+00 DE-
## ER Targets ER Targets 1.0918760 6.684622e-01 1.000000e+00 DE-
## Male DT Male DT 0.4566315 1.000000e+00 1.000000e+00 DE-
## Female DT Female DT 1.9977308 2.596165e-07 2.336549e-06 DE-
## SexDiv DT SexDiv DT 1.2341137 3.067365e-01 6.901571e-01 DE-
## DHT DE+1 DHT DE+ 1.0106258 7.099297e-01 9.997345e-01 DE+
## DHT DE-1 DHT DE- 2.2034290 4.484512e-03 1.345354e-02 DE+
## EST DE+1 EST DE+ 0.4878553 9.366615e-01 9.997345e-01 DE+
## EST DE-1 EST DE- 0.3886517 9.393973e-01 9.997345e-01 DE+
## AR Targets1 AR Targets 1.7105866 7.282498e-10 6.554248e-09 DE+
## ER Targets1 ER Targets 0.6071263 9.997345e-01 9.997345e-01 DE+
## Male DT1 Male DT 2.0990700 4.238625e-08 1.907381e-07 DE+
## Female DT1 Female DT 0.6863210 9.991975e-01 9.997345e-01 DE+
## SexDiv DT1 SexDiv DT 1.4805624 1.855500e-02 4.174876e-02 DE+
fontSize = 5
p = ggplot(data = df_gsea) +
geom_tile(aes(y = genelist, x = type, fill= -log10(pval))) +
geom_text(aes(y=genelist, x=type,label = round(OR,2)), size = fontSize) +
# scale_fill_gradient(low = "white", high="red") + ylab("")+xlab("") +
scale_fill_gradient(low = "white", high="red",
limits = c(0, 20),
oob = scales::squish) + ylab("")+xlab("") +
theme(
# Remove panel border
panel.border = element_blank(),
# Remove panel grid lines
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# Remove panel background
panel.background = element_blank(),
axis.text.x = element_text(size=fontSize),
axis.text.y = element_text(size=fontSize+10),
axis.title.x = element_text(size=fontSize),
strip.text.x = element_text(size=fontSize),
axis.title.y = element_text(size=fontSize))
ggsave(filename = file.path(plot_path, sprintf("DE.interaction.pfc.EnrichmentHeatmap%s.pdf",fstem)),width = 5, height=5)
p

Plot specific genes of interest
PLOT = FALSE
if (PLOT){
dotSize = 10
fontSize = 40
# get DE genes
mask = de_res_pfc[["interaction_table"]]$adj.P.Val<fdr_qThresh
genes2plot = de_res_pfc[["interaction_table"]]$ensembl_id[mask]
gene_names = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = is.element(hsap_homolog_gene_meta_data$hsapiens_homolog_ensembl_gene, sfari_genes_de_overlap$ensembl.id)
gene_names2print = hsap_homolog_gene_meta_data$external_gene_name[mask]
for (igene in 1:length(genes2plot)){
gene2plot = genes2plot[igene]
gene_name = gene_names[igene]
exprdata2plot = subset(v_pfc$E,is.element(rownames(v_pfc$E),gene2plot))
data2plot = data.frame(meta_data_pfc, expr_data = t(exprdata2plot))
colnames(data2plot)[dim(data2plot)[2]] = "expr_data"
p = ggplot(data = data2plot, aes(x = group, y = expr_data, colour = sex)) + facet_grid(. ~ sex)
p = p + geom_jitter(size = dotSize) + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
p = p + ylab("log(CPM)") + xlab("") + ggtitle(gene_name) +
theme(axis.text.x = element_text(size=fontSize),
axis.text.y = element_text(size=fontSize),
axis.title.x = element_text(size=fontSize),
strip.text.x = element_text(size=fontSize),
axis.title.y = element_text(size=fontSize),
plot.title = element_text(hjust = 0.5, size=fontSize))
ggsave(filename = file.path(plot_path, sprintf("%s.pdf",gene_name)), width=10,height=8)
}
}
Enrichment analysis with Allen cell types
plotWidth = 10
plotHeight = 7
# specify which databases to query
dbs = c("Allen_Brain_Atlas_10x_scRNA_2021")
dbs_names = c("Allen10x_scRNA_2021")
topNterms = 25
brain_region = "pfc"
de_genes2use = de_res_pfc[["interaction_table"]]$external_gene_name[de_res_pfc[["interaction_table"]]$adj.P.Val<=fdr_qThresh]
mask = de_res_pfc[["interaction_table"]]$adj.P.Val<=fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC>0
de_genes2use1 = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
mask = de_res_pfc[["interaction_table"]]$adj.P.Val<=fdr_qThresh & de_res_pfc[["interaction_table"]]$logFC<0
de_genes2use2 = de_res_pfc[["interaction_table"]]$external_gene_name[mask]
if (!is_empty(de_genes2use)){
enrichment_res = enrichr(de_genes2use, dbs)
enrichment_res1 = enrichr(de_genes2use1, dbs)
enrichment_res2 = enrichr(de_genes2use2, dbs)
for (idb in 1:length(dbs_names)){
# all DE genes
go_res = enrichment_res[[dbs[idb]]]
write.csv(go_res, file = file.path(res_path,
brain_region,
sprintf("DE.interaction.%s%s.csv",
dbs_names[idb],
fstem)))
data4plot = go_res[go_res$Adjusted.P.value<=fdr_qThresh,1:(ncol(go_res)-1)]
data4plot = data4plot[1:topNterms,]
p = ggplot(data = data4plot,
aes(x = reorder(Term, -log10(P.value)),
y = -log10(P.value),
fill = -log10(P.value))) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.05), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(-log10(0.05),max(-log10(data4plot$P.value)))) +
coord_flip()
ggsave(filename = file.path(plot_path,
sprintf("DE.interaction.%s.%s%s.pdf",
brain_region,
dbs_names[idb],
fstem)),
width = plotWidth, height = plotHeight)
# DE-
go_res = enrichment_res1[[dbs[idb]]]
write.csv(go_res, file = file.path(res_path,
brain_region,
sprintf("DE.interaction.%s.PosLogFC%s.csv",
dbs_names[idb],
fstem)))
data4plot = go_res[go_res$Adjusted.P.value<=fdr_qThresh,1:(ncol(go_res)-1)]
data4plot = data4plot[1:topNterms,]
p = ggplot(data = data4plot, aes(x = reorder(Term, -log10(P.value)), y = -log10(P.value), fill = -log10(P.value))) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.05), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(-log10(0.05),max(-log10(data4plot$P.value)))) +
coord_flip()
ggsave(filename = file.path(plot_path,
sprintf("DE.interaction.%s.%s.PosLogFC%s.pdf",
brain_region,
dbs_names[idb],
fstem)),
width = plotWidth, height = plotHeight)
# DE+
go_res = enrichment_res2[[dbs[idb]]]
write.csv(go_res, file = file.path(res_path,
brain_region,
sprintf("DE.interaction.%s.NegLogFC%s.csv",
dbs_names[idb],
fstem)))
data4plot = go_res[go_res$Adjusted.P.value<=fdr_qThresh,1:(ncol(go_res)-1)]
data4plot = data4plot[1:topNterms,]
p = ggplot(data = data4plot, aes(x = reorder(Term, -log10(P.value)),
y = -log10(P.value),
fill = -log10(P.value))) +
geom_bar(stat = "identity") +
ylab("-log10(p)") +
xlab(" ") +
geom_hline(yintercept = -log10(0.05), linetype="dashed") +
scale_fill_gradientn(colors = colorRampPalette(c("white","red"))(100),
limits = c(-log10(0.05),max(-log10(data4plot$P.value)))) +
coord_flip()
ggsave(filename = file.path(plot_path,
sprintf("DE.interaction.%s.%s.NegLogFC%s.pdf",
brain_region,
dbs_names[idb],
fstem)),
width = plotWidth, height = plotHeight)
} # for (idb in 1:length(dbs_names)){
} # if (!is_empty(de_genes2use)){
## Uploading data to Enrichr... Done.
## Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
## Parsing results... Done.
## Uploading data to Enrichr... Done.
## Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
## Parsing results... Done.
## Uploading data to Enrichr... Done.
## Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
## Parsing results... Done.
Annotate Allen cell type enrichments
brain_region = "pfc"
fnames2loop = c("DE.interaction.Allen10x_scRNA_2021","DE.interaction.Allen10x_scRNA_2021.NegLogFC",
"DE.interaction.Allen10x_scRNA_2021.PosLogFC")
for (i in 1:length(fnames2loop)){
celltype_data = read.csv(file.path(res_path, brain_region,sprintf("%s_filt0.15.csv",fnames2loop[i])))
celltype_data = annotate_celltypes(celltype_data)
celltype_data = subset(celltype_data, celltype_data$Adjusted.P.value<=fdr_qThresh)
celltype_data = celltype_data[order(celltype_data$class, decreasing = FALSE),]
write.csv(celltype_data, file = file.path(res_path, brain_region,sprintf("%s_FINAL_filt0.15.csv",fnames2loop[i])))
}# for (i in 1:length(fnames2loop)){
# -----------------------------------------------------------------------------
# plot DE neg and pos log FC direction in one plot
i=2
celltype_data = read.csv(file.path(res_path, brain_region,sprintf("%s_filt0.15.csv",fnames2loop[i])))
celltype_data = annotate_celltypes(celltype_data)
de_neg_celltypes = subset(celltype_data, celltype_data$Adjusted.P.value<=fdr_qThresh)
de_neg_celltypes$direction = "DE+" #"Neg log2(FC)"
i=3
celltype_data = read.csv(file.path(res_path, brain_region,sprintf("%s_filt0.15.csv",fnames2loop[i])))
celltype_data = annotate_celltypes(celltype_data)
de_pos_celltypes = subset(celltype_data, celltype_data$Adjusted.P.value<=fdr_qThresh)
de_pos_celltypes$direction = "DE-" # "Pos log2(FC)"
de_celltypes = rbind(de_pos_celltypes, de_neg_celltypes)
de_celltypes$direction = factor(de_celltypes$direction, levels = c("DE+","DE-"))
# get non-duplicated cluster IDs and final_label2
tmp = de_celltypes[,c("cluster_id","final_label2","class_label","plotcolor2use")]
tmp = tmp[!duplicated(tmp),]
tmp = tmp[order(tmp$cluster_id),]
tmp$cluster_id = factor(tmp$cluster_id)
tmp$final_label2 = factor(tmp$final_label2)
tmp$plotcolor2use = factor(tmp$plotcolor2use)
tmp$class_label = factor(tmp$class_label)
tmp = tmp %>% mutate(
name = glue("<b style='color:{plotcolor2use}'>{final_label2}</b>"),
name = fct_reorder(name, -as.numeric(cluster_id))
)
# turn cluster id into a factor and then reverse the ordering of the levels
de_celltypes$cluster_id = factor(de_celltypes$cluster_id)
de_celltypes$cluster_id = fct_rev(de_celltypes$cluster_id)
df2plot = merge(de_celltypes, tmp[,c("final_label2","name")], by = "final_label2")
p = ggplot(data = df2plot, aes(x = name, y = -log10(P.value), fill=class_label)) +
facet_grid(. ~ direction) +
geom_bar(stat = "identity") +
# scale_x_discrete(limits = rev(levels(tmp$cluster_id)), labels = rev(tmp$final_label2)) +
xlab("Cell Types") + ylab("-log10(p)") + coord_flip() + ggtitle(" ") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.y = element_markdown())
ggsave(filename = file.path(plot_path, "DE.celltypes.pdf"), height=12, width = 12)
p

p = ggplot(data = df2plot, aes(x = direction, y = name), fill = class_label) +
geom_tile(aes(x = direction, y = name, fill= -log10(P.value))) +
geom_text(aes(x = direction, y = name, label = round(Odds.Ratio,2))) +
scale_fill_gradient(low = "white", high="red") + ylab("")+xlab("") +
theme(
# Remove panel border
panel.border = element_blank(),
# Remove panel grid lines
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# Remove panel background
panel.background = element_blank(),
axis.text.y = element_markdown())
ggsave(filename = file.path(plot_path, "DE.celltypes.Heatmap.pdf"), width=6, height=12)
p
